# Package names packages <- c('MASS', 'knitr',
# 'readr', 'tidyr', 'dplyr', 'ggplot2', 'e1071',
# 'moments', 'corrplot', 'Hmisc',
# 'PerformanceAnalytics', 'mice', 'needs') #
# Install packages not yet installed
# installed_packages <- packages %in%
# rownames(installed.packages()) if
# (any(installed_packages == FALSE)) {
# install.packages(packages[!installed_packages])
# } # Packages loading invisible(lapply(packages,
# library, character.only = TRUE))
# prioritize(dplyr)
library(needs)
needs("dplyr", "MASS", "knitr", "readr", "tidyr", "ggplot2",
"e1071", "moments", "corrplot", "Hmisc", "PerformanceAnalytics",
"mice", "car", "glmnet")
prioritize(dplyr)
The data set is collection The World Bank Data, the variables of interest are extracted from the raw data files and combined into a single data frame for analysis. The final data set includes:
Data imports and combining:
# helper functions
importWDI <- function(filepath, value_name) {
df <- read_csv(filepath, skip = 4)
colnames(df) <- tolower(gsub(" ", ".", colnames(df)))
df <- df %>%
pivot_longer(5:ncol(.), names_to = "year",
values_to = "value") %>%
filter(!is.null(value) & !is.na(value)) %>%
mutate(country.code = factor(country.code),
country.name = factor(country.name), year = as.numeric(year)) %>%
select(country.code, country.name, year, value)
colnames(df)[4] <- value_name
df
}
importRegionClass <- function(filepath) {
df <- read_csv(filepath, skip = 4)
colnames(df) <- tolower(gsub(" ", ".", colnames(df)))
df %>%
mutate(country.name = factor(country.name),
region = factor(region)) %>%
select(country.name, reg = region)
}
importIncomeClass <- function(filepath) {
df <- read_csv(filepath, skip = 4)
colnames(df) <- tolower(gsub(" ", ".", colnames(df)))
df %>%
pivot_longer(3:ncol(.), names_to = "year",
values_to = "income") %>%
filter(!is.null(income) & !is.na(income)) %>%
mutate(country.code = factor(country.code),
country.name = factor(country.name), year = as.numeric(year),
income = factor(income)) %>%
select(country.code, country.name, year, income)
}
# import data
setwd("../data")
poverty.headcount <- importWDI("poverty.headcount.215dollar.csv",
"pov")
mpi <- importWDI("mpi.csv", "mpi")
education.expenditure.total <- importWDI("total.education.expenditure.csv",
"edu.total")
education.expenditure.primary <- importWDI("primary.education.expenditure.csv",
"edu.pri")
education.expenditure.secondary <- importWDI("secondary.education.expenditure.csv",
"edu.sec")
education.expenditure.tertiary <- importWDI("tertiary.education.expenditure.csv",
"edu.ter")
health.expenditure <- importWDI("health.expenditure.csv",
"hlth")
military.expenditure <- importWDI("military.expenditure.csv",
"mil")
fdi <- importWDI("fdi.csv", "fdi")
labour.force.participation <- importWDI("labour.force.participation.csv",
"lbr.part")
unemployment.rate <- importWDI("unemployment.csv",
"unemp")
population.growth <- importWDI("population.growth.csv",
"pop.gwth.total")
rural.population.growth <- importWDI("rural.population.growth.csv",
"pop.gwth.rural")
urban.population.growth <- importWDI("urban.population.growth.csv",
"pop.gwth.urban")
gdp.deflator <- importWDI("gdp.deflator.csv", "gdp.dflt")
gender.equality <- importWDI("gender.equality.csv",
"gdr.eql")
gross.capital.formation <- importWDI("gross.capital.formation.csv",
"gcf")
trade <- importWDI("trade.csv", "trade")
region.class <- importRegionClass("region.class.csv")
income.class <- importIncomeClass("income.class.csv")
gdp.pc <- importWDI("gdp.pc.csv", "gdp.pc")
setwd("../src")
We found that the data sets collected from World Bank’s Data helpdesk and The World Bank’s Data have different naming convention for certain countries (e.g. “Czechia” vs. “Czechnia Republic”). So we need to rename these countries to avoid some error when joining.
Furthermore, WDI’s data sets rate also account for non-country (e.g. country.name = “Low income” or “South Asia”). These special groups are not in our scope of interest, which is national, so we eliminate them.
# using poverty.headcount as a naming standard
# (as other data from WDI also use this
# convention) join a subset of data to process
# the names
d <- poverty.headcount %>%
select(country.name) %>%
mutate(inPov = T) %>%
full_join(income.class %>%
select(country.name) %>%
mutate(inIncome = T), by = "country.name") %>%
full_join(region.class %>%
select(country.name) %>%
mutate(inReg = T), by = "country.name") %>%
mutate(inPov = !is.na(inPov), inIncome = !is.na(inIncome),
inReg = !is.na(inReg))
d
First, remove special economic groups from
poverty.headcount. We figured these regions will not appear
in income.class or region.class, so we might
find something from looking at the countries only
appear in poverty.headcount.
d %>%
filter(inPov & (!inIncome | !inReg)) %>%
distinct(country.name)
Lucky! We can look through these 18 results and compose a list of special regions.
spec.reg <- c("Fragile and conflict affected situations",
"IDA total", "World", "East Asia & Pacific", "Europe & Central Asia",
"Latin America & Caribbean", "Middle East & North Africa",
"South Asia", "Sub-Saharan Africa", "Low income",
"Low & middle income", "Lower middle income", "Upper middle income",
"High income")
Then, we rename those countries with inconsistent naming convention. Since we should only care about countries whose poverty headcount is available, reusing the list generated above, we can identify:
# mapping standard name and variation
nameMap <- tibble(standard = c("Cote d'Ivoire", "Czechia",
"Czechia", "Curacao", "Turkiye", "Turkiye", "Sao Tome and Principe"),
variation = c("Côte d'Ivoire", "Czechoslovakia",
"Czech Republic", "Curaçao", "Turkey", "Türkiye",
"São Tomé and Príncipe"))
correctName <- function(name) {
tibble(name = name) %>%
left_join(nameMap, by = c(name = "variation")) %>%
mutate(standard = ifelse(is.na(standard), name,
standard)) %>%
select(standard) %>%
pull()
}
orig.name <- c("Vietnam", "China", "Turkey", "Czechia Republic")
correctName(orig.name)
## [1] "Vietnam" "China" "Turkiye" "Czechia Republic"
Let’s test this out!
d <- poverty.headcount %>%
# correct name here
mutate(country.name = correctName(country.name)) %>%
select(country.name) %>%
mutate(inPov = T) %>%
full_join(income.class %>%
# correct name here
mutate(country.name = correctName(country.name)) %>%
select(country.name) %>%
mutate(inIncome = T), by = "country.name") %>%
full_join(region.class %>%
# correct name here
mutate(country.name = correctName(country.name)) %>%
select(country.name) %>%
mutate(inReg = T), by = "country.name") %>%
filter(!(country.name %in% spec.reg)) %>%
mutate(inPov = !is.na(inPov), inIncome = !is.na(inIncome), inReg = !is.na(inReg))
# countries not in region list, but is in Pov list
d %>%
filter(!inReg & inPov) %>%
distinct(country.name) %>%
nrow()
## [1] 0
# countries not in income list, but is in Pov list
d %>%
filter(!inIncome & inPov) %>%
distinct(country.name) %>%
nrow()
## [1] 0
We are pretty confident that there’s no inconsistent naming left unprocessed in the data sets.
# Rename countries in all data sets.
poverty.headcount <- poverty.headcount %>%
mutate(country.name = correctName(country.name))
mpi <- mpi %>%
mutate(country.name = correctName(country.name))
education.expenditure.total <- education.expenditure.total %>%
mutate(country.name = correctName(country.name))
education.expenditure.primary <- education.expenditure.primary %>%
mutate(country.name = correctName(country.name))
education.expenditure.secondary <- education.expenditure.secondary %>%
mutate(country.name = correctName(country.name))
education.expenditure.tertiary <- education.expenditure.tertiary %>%
mutate(country.name = correctName(country.name))
health.expenditure <- health.expenditure %>%
mutate(country.name = correctName(country.name))
military.expenditure <- military.expenditure %>%
mutate(country.name = correctName(country.name))
fdi <- fdi %>%
mutate(country.name = correctName(country.name))
labour.force.participation <- labour.force.participation %>%
mutate(country.name = correctName(country.name))
unemployment.rate <- unemployment.rate %>%
mutate(country.name = correctName(country.name))
population.growth <- population.growth %>%
mutate(country.name = correctName(country.name))
rural.population.growth <- rural.population.growth %>%
mutate(country.name = correctName(country.name))
urban.population.growth <- urban.population.growth %>%
mutate(country.name = correctName(country.name))
gdp.deflator <- gdp.deflator %>%
mutate(country.name = correctName(country.name))
gender.equality <- gender.equality %>%
mutate(country.name = correctName(country.name))
gross.capital.formation <- gross.capital.formation %>%
mutate(country.name = correctName(country.name))
trade <- trade %>%
mutate(country.name = correctName(country.name))
region.class <- region.class %>%
mutate(country.name = correctName(country.name))
income.class <- income.class %>%
mutate(country.name = correctName(country.name))
gdp.pc <- gdp.pc %>%
mutate(country.name = correctName(country.name))
Join the data
countries <- poverty.headcount %>%
# We used a full join here so we can conduct
# a separate analysis on mpi later
full_join(mpi, by = c("country.name", "country.code",
"year")) %>%
left_join(income.class, c("country.name", "country.code",
"year")) %>%
left_join(region.class, by = "country.name") %>%
left_join(education.expenditure.total, by = c("country.name",
"country.code", "year")) %>%
left_join(education.expenditure.primary, by = c("country.name",
"country.code", "year")) %>%
left_join(education.expenditure.secondary, by = c("country.name",
"country.code", "year")) %>%
left_join(education.expenditure.tertiary, by = c("country.name",
"country.code", "year")) %>%
left_join(health.expenditure, by = c("country.name",
"country.code", "year")) %>%
left_join(military.expenditure, by = c("country.name",
"country.code", "year")) %>%
left_join(fdi, by = c("country.name", "country.code",
"year")) %>%
left_join(labour.force.participation, by = c("country.name",
"country.code", "year")) %>%
left_join(unemployment.rate, by = c("country.name",
"country.code", "year")) %>%
left_join(population.growth, by = c("country.name",
"country.code", "year")) %>%
left_join(rural.population.growth, by = c("country.name",
"country.code", "year")) %>%
left_join(urban.population.growth, by = c("country.name",
"country.code", "year")) %>%
left_join(gdp.deflator, by = c("country.name",
"country.code", "year")) %>%
left_join(gender.equality, by = c("country.name",
"country.code", "year")) %>%
left_join(gross.capital.formation, by = c("country.name",
"country.code", "year")) %>%
left_join(trade, by = c("country.name", "country.code",
"year")) %>%
left_join(gdp.pc, by = c("country.name", "country.code",
"year")) %>%
# filter special groups
filter(!(country.name %in% spec.reg))
Data preview
head(countries)
str(countries)
## tibble [1,901 × 24] (S3: tbl_df/tbl/data.frame)
## $ country.code : Factor w/ 272 levels "AGO","ALB","ARE",..: 1 1 1 2 2 2 2 2 2 2 ...
## $ country.name : chr [1:1901] "Angola" "Angola" "Angola" "Albania" ...
## $ year : num [1:1901] 2000 2008 2018 1996 2002 ...
## $ pov : num [1:1901] 21.4 14.6 31.1 0.5 1.1 0.6 0.2 0.6 1 0.1 ...
## $ mpi : num [1:1901] NA NA NA NA NA NA NA NA NA NA ...
## $ income : Factor w/ 4 levels "H","L","LM","UM": 2 3 3 3 3 3 3 4 4 4 ...
## $ reg : Factor w/ 7 levels "East Asia & Pacific",..: 7 7 7 2 2 2 2 2 2 2 ...
## $ edu.total : num [1:1901] 2.61 2.69 2.04 3.08 3.12 ...
## $ edu.pri : num [1:1901] NA NA NA NA NA ...
## $ edu.sec : num [1:1901] NA NA NA NA NA ...
## $ edu.ter : num [1:1901] NA NA NA NA NA ...
## $ hlth : num [1:1901] 1.91 3.32 2.54 NA 6.91 ...
## $ mil : num [1:1901] 6.39 3.57 1.87 1.38 1.32 ...
## $ fdi : num [1:1901] 8.79e+08 1.68e+09 -6.46e+09 9.01e+07 1.35e+08 ...
## $ lbr.part : num [1:1901] NA NA NA 38.8 59.6 ...
## $ unemp : num [1:1901] NA NA NA 12.3 15.8 ...
## $ pop.gwth.total: num [1:1901] 3.277 3.711 3.276 -0.622 -0.3 ...
## $ pop.gwth.rural: num [1:1901] 0.921 1.91 1.338 -1.546 -2.169 ...
## $ pop.gwth.urban: num [1:1901] 5.682 5.02 4.312 0.812 2.181 ...
## $ gdp.dflt : num [1:1901] 418.02 19.37 28.17 38.17 3.65 ...
## $ gdr.eql : num [1:1901] NA 3 NA NA NA 4 NA NA NA NA ...
## $ gcf : num [1:1901] 30.5 30.8 17.9 18.1 35.3 ...
## $ trade : num [1:1901] 152.5 121.4 66.4 44.9 68.5 ...
## $ gdp.pc : num [1:1901] 557 4081 2525 1010 1425 ...
summary(countries)
## country.code country.name year pov
## BRA : 36 Length:1901 Min. :1967 Min. : 0.00
## CRI : 34 Class :character 1st Qu.:2002 1st Qu.: 0.20
## ARG : 32 Mode :character Median :2009 Median : 1.50
## USA : 32 Mean :2007 Mean :10.04
## DEU : 30 3rd Qu.:2014 3rd Qu.:11.60
## HND : 30 Max. :2021 Max. :91.50
## (Other):1707 NA's :58
## mpi income reg edu.total
## Min. : 2.37 H :644 East Asia & Pacific :167 Min. : 1.033
## 1st Qu.:18.30 L :253 Europe & Central Asia :883 1st Qu.: 3.522
## Median :24.80 LM :501 Latin America & Caribbean :416 Median : 4.519
## Mean :27.06 UM :438 Middle East & North Africa:107 Mean : 4.582
## 3rd Qu.:33.30 NA's: 65 North America : 50 3rd Qu.: 5.457
## Max. :74.20 South Asia : 61 Max. :15.750
## NA's :1446 Sub-Saharan Africa :217 NA's :596
## edu.pri edu.sec edu.ter hlth
## Min. : 0.6578 Min. : 2.724 Min. : 0.00 Min. : 1.718
## 1st Qu.:24.0269 1st Qu.:30.138 1st Qu.:16.61 1st Qu.: 5.151
## Median :30.4730 Median :35.713 Median :20.59 Median : 6.914
## Mean :31.6633 Mean :35.630 Mean :20.96 Mean : 6.975
## 3rd Qu.:38.3324 3rd Qu.:41.380 3rd Qu.:25.14 3rd Qu.: 8.565
## Max. :70.0950 Max. :71.587 Max. :50.44 Max. :17.733
## NA's :1090 NA's :1094 NA's :963 NA's :464
## mil fdi lbr.part unemp
## Min. : 0.000 Min. :-3.444e+11 Min. :30.50 Min. : 0.250
## 1st Qu.: 1.042 1st Qu.: 2.979e+08 1st Qu.:56.17 1st Qu.: 4.513
## Median : 1.468 Median : 1.709e+09 Median :61.18 Median : 6.880
## Mean : 1.787 Mean : 1.598e+10 Mean :60.78 Mean : 8.145
## 3rd Qu.: 2.103 3rd Qu.: 9.821e+09 3rd Qu.:65.49 3rd Qu.:10.078
## Max. :19.385 Max. : 7.338e+11 Max. :93.00 Max. :49.700
## NA's :105 NA's :17 NA's :320 NA's :267
## pop.gwth.total pop.gwth.rural pop.gwth.urban gdp.dflt
## Min. :-3.6295 Min. :-8.56066 Min. :-4.078 Min. : -26.300
## 1st Qu.: 0.2656 1st Qu.:-0.85664 1st Qu.: 0.510 1st Qu.: 1.696
## Median : 1.0318 Median :-0.02461 Median : 1.484 Median : 3.865
## Mean : 1.0565 Mean : 0.00083 Mean : 1.691 Mean : 15.831
## 3rd Qu.: 1.7761 3rd Qu.: 0.96362 3rd Qu.: 2.657 3rd Qu.: 8.537
## Max. : 5.6145 Max. : 4.59686 Max. :13.805 Max. :3333.585
## NA's :14 NA's :13 NA's :18
## gdr.eql gcf trade gdp.pc
## Min. :1.500 Min. : 0.00 Min. : 1.378 Min. : 119.7
## 1st Qu.:3.000 1st Qu.:19.65 1st Qu.: 51.063 1st Qu.: 1927.9
## Median :3.500 Median :22.73 Median : 73.496 Median : 6032.1
## Mean :3.592 Mean :23.88 Mean : 84.429 Mean : 15219.5
## 3rd Qu.:4.000 3rd Qu.:26.76 3rd Qu.:105.462 3rd Qu.: 21490.4
## Max. :5.000 Max. :69.48 Max. :380.104 Max. :123678.7
## NA's :1635 NA's :72 NA's :57 NA's :8
There’s no NA in reg, which is a sign that all naming in
the data is remedied. There’s some expected NAs in income
and pov, as these data are collected by year. There’s a
substantial amount of missing data in mpi, as this is a
relative new concept. We will address the nature, and processing of
missing data in the next sections.
As observed from the summary above, the data set contains a lot of missing values in some of the variables.
mean(is.na(countries))
## [1] 0.1819656
About 19% of the data set is missing.
nCompleteObs <- sum(complete.cases(countries))
print(paste("No. of complete cases:", nCompleteObs))
## [1] "No. of complete cases: 3"
There are only 3 complete cases where all the variable is available. This is nowhere near acceptable to conduct any meaningful analysis. Therefore, we need to eliminate some variables for a more balance data set.
missings <- colMeans(is.na(countries))
ggplot(mapping = aes(x = names(missings), y = missings,
fill = missings < 0.35)) + geom_bar(stat = "identity") +
ggtitle("Missing rate") + xlab("variables") + ylab("% missing") +
theme(axis.text.x = element_text(size = 9, angle = 90))
missings[missings > 0.35]
## mpi edu.pri edu.sec edu.ter gdr.eql
## 0.7606523 0.5733824 0.5754866 0.5065755 0.8600736
There are 5 variables with missing rate >35%.
expenditure in primary, secondary, and tertiary edication can be very useful and relevant information to predict poverty reduction (Akbar et al. 2019). However, we would like to exclude these variables from some first analyses to make use of the richer set of data. We can conduct a separate analysis with these variable to gain more insight.
# variables with high missing rate
hMiss <- names(missings[missings > 0.35])
# exclude these variables in countries1
countries1 <- countries %>%
select(!hMiss) %>%
filter(!is.na(pov))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(hMiss)` instead of `hMiss` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
str(countries1)
## tibble [1,843 × 19] (S3: tbl_df/tbl/data.frame)
## $ country.code : Factor w/ 272 levels "AGO","ALB","ARE",..: 1 1 1 2 2 2 2 2 2 2 ...
## $ country.name : chr [1:1843] "Angola" "Angola" "Angola" "Albania" ...
## $ year : num [1:1843] 2000 2008 2018 1996 2002 ...
## $ pov : num [1:1843] 21.4 14.6 31.1 0.5 1.1 0.6 0.2 0.6 1 0.1 ...
## $ income : Factor w/ 4 levels "H","L","LM","UM": 2 3 3 3 3 3 3 4 4 4 ...
## $ reg : Factor w/ 7 levels "East Asia & Pacific",..: 7 7 7 2 2 2 2 2 2 2 ...
## $ edu.total : num [1:1843] 2.61 2.69 2.04 3.08 3.12 ...
## $ hlth : num [1:1843] 1.91 3.32 2.54 NA 6.91 ...
## $ mil : num [1:1843] 6.39 3.57 1.87 1.38 1.32 ...
## $ fdi : num [1:1843] 8.79e+08 1.68e+09 -6.46e+09 9.01e+07 1.35e+08 ...
## $ lbr.part : num [1:1843] NA NA NA 38.8 59.6 ...
## $ unemp : num [1:1843] NA NA NA 12.3 15.8 ...
## $ pop.gwth.total: num [1:1843] 3.277 3.711 3.276 -0.622 -0.3 ...
## $ pop.gwth.rural: num [1:1843] 0.921 1.91 1.338 -1.546 -2.169 ...
## $ pop.gwth.urban: num [1:1843] 5.682 5.02 4.312 0.812 2.181 ...
## $ gdp.dflt : num [1:1843] 418.02 19.37 28.17 38.17 3.65 ...
## $ gcf : num [1:1843] 30.5 30.8 17.9 18.1 35.3 ...
## $ trade : num [1:1843] 152.5 121.4 66.4 44.9 68.5 ...
## $ gdp.pc : num [1:1843] 557 4081 2525 1010 1425 ...
Re-evaluate the countries1 set.
mean(is.na(countries1))
## [1] 0.05471628
sum(complete.cases(countries1))
## [1] 937
mean(complete.cases(countries1))
## [1] 0.5084102
On average, each column has 6% missing rate, results in 937 complete data point (i.e. 49%). This can be a sufficient number for the analysis. However, the missing data can induce loss of power due to the reduced sample size, and some other biases depending on which variables is missing.
# complete rate of data by regions
countries1 %>%
mutate(isComplete = complete.cases(.)) %>%
group_by(reg) %>%
summarise(complete.rate = mean(isComplete)) %>%
arrange(desc(complete.rate))
Countries from North America, Sub-Saharan Africa, and South Asia have the highest rate of missing data. We suspect that Sub-Saharan Africa, and South Asia are comparably less accessible regions. We also know that Americans don’t like filling out forms, so their high rate of missing data is understandable as well.
Still, we need to find a way to address this issue. we propose several approaches:
Distribution of the predicted variable pov
ggplot(countries, aes(x = pov)) + geom_histogram(aes(y = ..density..)) +
geom_density(alpha = 0.2, fill = "blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The graph displays a decreasing rate as poverty indicator increasing.
This might not be representative of the current state of poverty in the
world, but of the number presented in our data. For example, more recent
data is likely to be more inclusive than ancient data, when poverty is
more prevalent. We should look at data from the same period.
# number of data point available from 1967 to
# 2021
ggplot(poverty.headcount, aes(x = year)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# pov data from 1998 and 2018
pov.98.18 <- poverty.headcount %>%
filter(year == 1998 | year == 2018)
pov.98.18 %>%
group_by(year) %>%
summarise(sum = n())
ggplot(pov.98.18, aes(x = pov)) + geom_histogram(aes(y = ..density..)) +
geom_density(alpha = 0.2, fill = "blue") + facet_grid(cols = vars(year))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The graph for 1998 has a much gentler slope, meaning poverty was more popular during that time, as predicted from out intuition. What about the general progress of the world?
# Re-import pov and only take special regions
# geographic
geo.regs <- c("EAS", "ECS", "LCN", "MEA", "SAS", "SSF",
"WLD")
# economics
eco.regs <- c("HIC", "LIC", "LMC", "LMY", "UMC")
pov.reg <- importWDI("../data/poverty.headcount.215dollar.csv",
"pov") %>%
filter(country.code %in% c(geo.regs, eco.regs)) %>%
mutate(type = ifelse(country.code %in% geo.regs,
"Geographic", "Economics"))
## New names:
## Rows: 266 Columns: 67
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): Country Name, Country Code, Indicator Name, Indicator Code dbl (50): 1967,
## 1969, 1971, 1974, 1975, 1977, 1978, 1979, 1980, 1981, 1982, ... lgl (13): 1960,
## 1961, 1962, 1963, 1964, 1965, 1966, 1968, 1970, 1972, 1973, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...67`
pov.reg %>%
distinct(country.code, country.name, type) %>%
arrange(type)
# World
ggplot(pov.reg %>%
filter(country.code == "WLD"), aes(x = year, y = pov)) +
geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
An overall very steady decrease of poverty. How about each region?
ggplot(pov.reg %>%
filter(country.code != "WLD"), aes(x = year, y = pov,
color = country.name)) + geom_point() + geom_smooth() +
facet_grid(rows = vars(type))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
There’s a general steady, but distinct decline of poverty over time in each type region of respective type. Latin America & Caribbean, Europe & Central Asia, Middle East & North Africa, and High Income group has a more gradual decline as they are not very poor to begin with.
Among the income groups, Low & Middle Income, Lower & Middle Income, and Upper Middle Income have quite similar in term of poverty indicator and slope over the year. While these values vary greatly among different geographical regions.
Let’s see some important statistics
stats <- poverty.headcount %>%
summarise(count = n(), skewness = skewness(pov),
kurtosis = kurtosis(pov), std.deviation = sd(pov))
kable(stats)
| count | skewness | kurtosis | std.deviation |
|---|---|---|---|
| 2322 | 1.598182 | 1.661128 | 19.48582 |
We can conduct some preliminary checks on linearity and correlation
of predictors to have a better picture of the data. Checklist
1. Linear relationship:
Use correlation matrix to check linearity
# select only numeric data
countries.num <- countries %>%
select(where(is.numeric))
chart.Correlation(countries.num, histogram = TRUE,
pch = 19)
Which is utterly intelligible, but a majority of the fitted lines are
linear at first glance. We should render separate graphs for the
relationship between
pov & other variables.
# lengthen data table to variable-value pairs,
# with pov as predicted variable and others as
# predictive
countries.num2 <- countries.num %>%
pivot_longer(cols = 3:ncol(.), names_to = "variable",
values_to = "value") %>%
filter(!is.na(value) & !is.na(pov))
drawGraph <- function(indvar, data) {
ggplot(data %>%
filter(variable == indvar), aes(x = value,
y = pov)) + geom_point() + geom_smooth(method = lm) +
labs(title = paste("Relationship pov ~", indvar),
x = indvar, y = "pov")
}
ivs <- distinct(countries.num2, variable)[[1]]
for (indvar in ivs) {
print(ggplot(countries.num2 %>%
filter(variable == indvar), aes(x = value,
y = pov)) + geom_point() + geom_smooth(method = lm) +
labs(title = paste("Relationship pov ~", indvar),
x = indvar, y = "pov"))
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Most variables display linear relationship with some exceptions, which are more appropriate to assume a logarithmic relationship. Gender equality should be viewed as a categorical data.
logIvs <- c("fdi", "gdp.dflt", "gdp.pc")
for (indvar in logIvs) {
print(ggplot(countries.num2 %>%
filter(variable == indvar), aes(x = log(value),
y = pov)) + geom_point() + geom_smooth(method = lm) +
labs(title = paste0("Relationship pov ~ log(",
indvar, ")"), x = paste0("log(", indvar,
")"), y = "pov"))
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
The relationship are more linear after the transformation.
gdr.eql <- countries.num2 %>%
filter(variable == "gdr.eql") %>%
mutate(value = factor(value))
ggplot(gdr.eql, aes(x = value, y = pov)) + geom_boxplot() +
geom_smooth(method = lm) + labs(title = paste("Relationship pov ~ gdr.eql"),
x = "gdr.eql", y = "pov")
## `geom_smooth()` using formula 'y ~ x'
There’s a significant correlation between gender equality and
poverty.
Correlation coefficients between pov and predictors.
# transform some variables
countries.num3 <- countries.num %>%
mutate(lfdi = log(fdi), lgdp.dflt = log(gdp.dflt),
lgdp.pc = log(gdp.pc)) %>%
mutate(lfdi = ifelse(is.nan(lfdi) | lfdi == -Inf,
NA, lfdi))
# name of the independent variables
cn <- colnames(countries.num3)[-c(1, 2)]
# correlation of independent variable and pov
corWithPov <- function(indevar, data) {
cor(data[[indevar]], data[["pov"]], use = "complete.obs")
}
cor.pov <- sapply(cn, corWithPov, data = countries.num3)
kable(cor.pov)
| x | |
|---|---|
| mpi | 0.4036012 |
| edu.total | -0.3098112 |
| edu.pri | 0.4711269 |
| edu.sec | -0.3172901 |
| edu.ter | -0.1284782 |
| hlth | -0.2938428 |
| mil | -0.0069375 |
| fdi | -0.1441670 |
| lbr.part | 0.2316754 |
| unemp | -0.0947166 |
| pop.gwth.total | 0.5290581 |
| pop.gwth.rural | 0.4943156 |
| pop.gwth.urban | 0.5866539 |
| gdp.dflt | 0.0457111 |
| gdr.eql | -0.4505635 |
| gcf | -0.1108507 |
| trade | -0.2387164 |
| gdp.pc | -0.3773475 |
| lfdi | -0.4875988 |
| lgdp.dflt | 0.2992969 |
| lgdp.pc | -0.7053998 |
edu.ter, mil, fdi,
lbr.part, unemp, gdp.dflt,
gcf have negligible correlation with pov.
lgdp.pc, lfdi, and lgdp.dflt have
stronger linear relationship with pov than their
un-transformed counterparts.
countries1 <- countries1 %>%
mutate(lfdi = log(fdi), lgdp.dflt = log(gdp.dflt),
lgdp.pc = log(gdp.pc)) %>%
mutate(lfdi = ifelse(is.nan(lfdi) | lfdi == -Inf,
NA, lfdi))
2. Predictors are independent There should be no correlation/multicolinearity between each pair of predictors.
countries.arr <- simplify2array(countries.num %>%
select(-c("year", "pov")))
cor.mtx <- rcorr(countries.arr, type = "spearman")
round(cor.mtx$r, 2)
## mpi edu.total edu.pri edu.sec edu.ter hlth mil fdi
## mpi 1.00 -0.49 0.29 -0.10 -0.28 -0.43 0.13 -0.13
## edu.total -0.49 1.00 -0.33 0.12 0.31 0.48 -0.08 0.12
## edu.pri 0.29 -0.33 1.00 -0.62 -0.44 -0.36 0.06 -0.24
## edu.sec -0.10 0.12 -0.62 1.00 0.02 0.26 0.04 0.34
## edu.ter -0.28 0.31 -0.44 0.02 1.00 0.30 0.04 0.19
## hlth -0.43 0.48 -0.36 0.26 0.30 1.00 -0.03 0.22
## mil 0.13 -0.08 0.06 0.04 0.04 -0.03 1.00 0.07
## fdi -0.13 0.12 -0.24 0.34 0.19 0.22 0.07 1.00
## lbr.part -0.41 0.01 0.35 -0.33 -0.04 -0.11 0.02 0.08
## unemp 0.40 0.00 -0.14 0.22 0.00 0.22 0.14 -0.04
## pop.gwth.total -0.14 -0.14 0.60 -0.45 -0.11 -0.29 -0.05 -0.27
## pop.gwth.rural -0.02 -0.16 0.29 -0.18 -0.10 -0.26 0.04 -0.32
## pop.gwth.urban -0.06 -0.21 0.68 -0.51 -0.16 -0.37 -0.07 -0.27
## gdp.dflt 0.19 -0.19 0.22 -0.17 -0.15 -0.38 0.01 -0.22
## gdr.eql -0.43 0.44 -0.50 0.41 -0.21 0.39 -0.11 0.08
## gcf -0.04 -0.10 -0.12 0.01 0.08 -0.21 -0.05 0.04
## trade -0.24 0.30 -0.35 0.08 0.12 0.09 -0.34 -0.09
## gdp.pc -0.66 0.47 -0.53 0.38 0.39 0.59 -0.06 0.59
## lbr.part unemp pop.gwth.total pop.gwth.rural pop.gwth.urban
## mpi -0.41 0.40 -0.14 -0.02 -0.06
## edu.total 0.01 0.00 -0.14 -0.16 -0.21
## edu.pri 0.35 -0.14 0.60 0.29 0.68
## edu.sec -0.33 0.22 -0.45 -0.18 -0.51
## edu.ter -0.04 0.00 -0.11 -0.10 -0.16
## hlth -0.11 0.22 -0.29 -0.26 -0.37
## mil 0.02 0.14 -0.05 0.04 -0.07
## fdi 0.08 -0.04 -0.27 -0.32 -0.27
## lbr.part 1.00 -0.41 0.24 0.15 0.25
## unemp -0.41 1.00 -0.27 -0.16 -0.30
## pop.gwth.total 0.24 -0.27 1.00 0.68 0.91
## pop.gwth.rural 0.15 -0.16 0.68 1.00 0.46
## pop.gwth.urban 0.25 -0.30 0.91 0.46 1.00
## gdp.dflt 0.05 -0.11 0.22 0.11 0.23
## gdr.eql -0.08 0.27 -0.58 -0.55 -0.53
## gcf 0.01 -0.17 -0.06 0.01 0.00
## trade -0.14 -0.03 -0.28 -0.13 -0.29
## gdp.pc -0.04 0.02 -0.44 -0.43 -0.50
## gdp.dflt gdr.eql gcf trade gdp.pc
## mpi 0.19 -0.43 -0.04 -0.24 -0.66
## edu.total -0.19 0.44 -0.10 0.30 0.47
## edu.pri 0.22 -0.50 -0.12 -0.35 -0.53
## edu.sec -0.17 0.41 0.01 0.08 0.38
## edu.ter -0.15 -0.21 0.08 0.12 0.39
## hlth -0.38 0.39 -0.21 0.09 0.59
## mil 0.01 -0.11 -0.05 -0.34 -0.06
## fdi -0.22 0.08 0.04 -0.09 0.59
## lbr.part 0.05 -0.08 0.01 -0.14 -0.04
## unemp -0.11 0.27 -0.17 -0.03 0.02
## pop.gwth.total 0.22 -0.58 -0.06 -0.28 -0.44
## pop.gwth.rural 0.11 -0.55 0.01 -0.13 -0.43
## pop.gwth.urban 0.23 -0.53 0.00 -0.29 -0.50
## gdp.dflt 1.00 0.18 0.09 -0.20 -0.49
## gdr.eql 0.18 1.00 0.30 0.45 0.33
## gcf 0.09 0.30 1.00 0.21 -0.01
## trade -0.20 0.45 0.21 1.00 0.24
## gdp.pc -0.49 0.33 -0.01 0.24 1.00
corrplot(cor.mtx$r, type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45)
Some significant correlations can be found between
gdr.eql
with edu.pri, pop.gwth.total, and
fdi with mil, etc. We expect these variables
to be eliminated in the VIF test. This might be a good property for
imputation (3.2.4)
We conduct a normal linear regression, following the approaches mentioned above to address missing values issues.
1. What is imputation
Imputation is a technique to handle missing values by replacing missing
data with substitute values. In our study, we focus on “item
imputation”, which means substituting for a component of a data point
(i.e. a variable). The general idea is to take a value that preserve the
property of the data (e.g., distribution, mean, standard deviation), or
predict the value using other variables.
We use Multivariate Imputation By Chained Equations (MICE) as our primary tool for this technique, because of its wide acceptance in scientific studies (Alruhaymi and Kim (2021)). In order to use this technique, we have to assume that the “missingness” of a field can be explained by the values in other columns, (e.g., If the countries is in North America, it’s more likely to be missing as we have discovered in 2.2). The general ideas of the algorithm is iteratively predict the missing values based on other variables, and improve it after each iterations until predicted value converse to a stable point. The algorithmic details of MICE is very concisely explained in Gopalan (2020).
Our parameter for this algorithm is fairly standard:
m = 5 imputed data setsmaxit = 30 iterationsThe imputation method is cart (Classification and
Regression Trees).
set.seed(1984)
# split data
isComplete <- which(complete.cases(countries1))
idx <- sample(isComplete, replace = F, 0.3 * nrow(countries1))
countries.train.4 <- countries1[-idx, ]
countries.test.4 <- countries1[idx, ]
# blank for no imputation, cart for variable
# requiring imputation
meth <- c(rep("", 4), "cart", "", rep("cart", 16))
names(meth) <- colnames(countries1)
meth
## country.code country.name year pov income
## "" "" "" "" "cart"
## reg edu.total hlth mil fdi
## "" "cart" "cart" "cart" "cart"
## lbr.part unemp pop.gwth.total pop.gwth.rural pop.gwth.urban
## "cart" "cart" "cart" "cart" "cart"
## gdp.dflt gcf trade gdp.pc lfdi
## "cart" "cart" "cart" "cart" "cart"
## lgdp.dflt lgdp.pc
## "cart" "cart"
# run the algorithm countries1.imputed <-
# mice(countries1, m = 3, maxit = 20, method =
# meth) saveRDS(countries1.imputed,
# 'countries1.imputed.RData')
countries1.imputed <- readRDS("countries1.imputed.RData")
summary(countries1.imputed)
## Class: mids
## Number of multiple imputations: 3
## Imputation methods:
## country.code country.name year pov income
## "" "" "" "" "cart"
## reg edu.total hlth mil fdi
## "" "cart" "cart" "cart" "cart"
## lbr.part unemp pop.gwth.total pop.gwth.rural pop.gwth.urban
## "cart" "cart" "" "cart" "cart"
## gdp.dflt gcf trade gdp.pc lfdi
## "cart" "cart" "cart" "cart" "cart"
## lgdp.dflt lgdp.pc
## "cart" "cart"
## PredictorMatrix:
## country.code country.name year pov income reg edu.total hlth mil
## country.code 0 0 1 1 1 1 1 1 1
## country.name 1 0 1 1 1 1 1 1 1
## year 1 0 0 1 1 1 1 1 1
## pov 1 0 1 0 1 1 1 1 1
## income 1 0 1 1 0 1 1 1 1
## reg 1 0 1 1 1 0 1 1 1
## fdi lbr.part unemp pop.gwth.total pop.gwth.rural pop.gwth.urban
## country.code 1 1 1 1 1 1
## country.name 1 1 1 1 1 1
## year 1 1 1 1 1 1
## pov 1 1 1 1 1 1
## income 1 1 1 1 1 1
## reg 1 1 1 1 1 1
## gdp.dflt gcf trade gdp.pc lfdi lgdp.dflt lgdp.pc
## country.code 1 1 1 1 1 1 1
## country.name 1 1 1 1 1 1 1
## year 1 1 1 1 1 1 1
## pov 1 1 1 1 1 1 1
## income 1 1 1 1 1 1 1
## reg 1 1 1 1 1 1 1
## Number of logged events: 961
## it im dep meth
## 1 0 0 constant
## 2 1 1 income cart
## 3 1 1 edu.total cart
## 4 1 1 hlth cart
## 5 1 1 mil cart
## 6 1 1 fdi cart
## out
## 1 country.name
## 2 country.codeEAS, country.codeECS, country.codeFCS, country.codeHIC, country.codeIDA, country.codeLCN, country.codeLIC, country.codeLMC, country.codeLMY, country.codeMEA, country.codeNRU, country.codeSAS, country.codeSSF, country.codeUMC, country.codeWLD, country.codeAFG, country.codeABW, country.codeAND, country.codeANT, country.codeASM, country.codeATG, country.codeBHR, country.codeBHS, country.codeBMU, country.codeBRB, country.codeBRN, country.codeCHI, country.codeCSK, country.codeCUB, country.codeCUW, country.codeCYM, country.codeDMA, country.codeERI, country.codeFRO, country.codeGIB, country.codeGNQ, country.codeGRD, country.codeGRL, country.codeGUM, country.codeHKG, country.codeIMN, country.codeKHM, country.codeKNA, country.codeKWT, country.codeLBY, country.codeLIE, country.codeMAC, country.codeMAF, country.codeMCO, country.codeMNP, country.codeMYT, country.codeNCL, country.codeNZL, country.codeOMN, country.codePLW, country.codePRI, country.codePRK, country.codePYF, country.codeQAT, country.codeSAU, country.codeSGP, country.codeSMR, country.codeSUN, country.codeSXM, country.codeTCA, country.codeTWN, country.codeVCT, country.codeVGB, country.codeVIR, country.codeYUG, country.codeYUGf, country.codeAFE, country.codeAFW, country.codeARB, country.codeCEB, country.codeCSS, country.codeEAP, country.codeEAR, country.codeECA, country.codeEMU, country.codeEUU, country.codeHPC, country.codeIBD, country.codeIBT, country.codeIDB, country.codeIDX, country.codeLAC, country.codeLDC, country.codeLTE, country.codeMIC, country.codeMNA, country.codeNAC, country.codeOED, country.codeOSS, country.codePRE, country.codePST, country.codeSSA, country.codeSST, country.codeTEA, country.codeTEC, country.codeTLA, country.codeTMN, country.codeTSA, country.codeTSS, country.codePSS, regEurope & Central Asia, regLatin America & Caribbean, regNorth America, regSouth Asia, regSub-Saharan Africa, edu.total
## 3 country.codeARE, country.codeBIH, country.codeCOD, country.codeDZA, country.codeEAS, country.codeECS, country.codeFCS, country.codeFSM, country.codeGUY, country.codeHIC, country.codeHTI, country.codeIDA, country.codeIRQ, country.codeLCN, country.codeLIC, country.codeLMC, country.codeLMY, country.codeMEA, country.codeMKD, country.codeMNE, country.codeNGA, country.codeNRU, country.codePNG, country.codeSAS, country.codeSOM, country.codeSSF, country.codeSUR, country.codeTKM, country.codeTTO, country.codeTUV, country.codeUMC, country.codeUZB, country.codeWLD, country.codeXKX, country.codeYEM, country.codeAFG, country.codeABW, country.codeAND, country.codeANT, country.codeASM, country.codeATG, country.codeBHR, country.codeBHS, country.codeBMU, country.codeBRB, country.codeBRN, country.codeCHI, country.codeCSK, country.codeCUB, country.codeCUW, country.codeCYM, country.codeDMA, country.codeERI, country.codeFRO, country.codeGIB, country.codeGNQ, country.codeGRD, country.codeGRL, country.codeGUM, country.codeHKG, country.codeIMN, country.codeKHM, country.codeKNA, country.codeKWT, country.codeLBY, country.codeLIE, country.codeMAC, country.codeMAF, country.codeMCO, country.codeMNP, country.codeMYT, country.codeNCL, country.codeNZL, country.codeOMN, country.codePLW, country.codePRI, country.codePRK, country.codePYF, country.codeQAT, country.codeSAU, country.codeSGP, country.codeSMR, country.codeSUN, country.codeSXM, country.codeTCA, country.codeTWN, country.codeVCT, country.codeVGB, country.codeVIR, country.codeYUG, country.codeYUGf, country.codeAFE, country.codeAFW, country.codeARB, country.codeCEB, country.codeCSS, country.codeEAP, country.codeEAR, country.codeECA, country.codeEMU, country.codeEUU, country.codeHPC, country.codeIBD, country.codeIBT, country.codeIDB, country.codeIDX, country.codeLAC, country.codeLDC, country.codeLTE, country.codeMIC, country.codeMNA, country.codeNAC, country.codeOED, country.codeOSS, country.codePRE, country.codePST, country.codeSSA, country.codeSST, country.codeTEA, country.codeTEC, country.codeTLA, country.codeTMN, country.codeTSA, country.codeTSS, country.codePSS, regEurope & Central Asia, regLatin America & Caribbean, regMiddle East & North Africa, regNorth America, hlth, fdi
## 4 country.codeBLZ, country.codeEAS, country.codeECS, country.codeFCS, country.codeGUY, country.codeHIC, country.codeIDA, country.codeLCN, country.codeLIC, country.codeLMC, country.codeLMY, country.codeMEA, country.codePSE, country.codeSAS, country.codeSOM, country.codeSSD, country.codeSSF, country.codeSUR, country.codeTKM, country.codeTTO, country.codeUMC, country.codeUSA, country.codeWLD, country.codeXKX, country.codeAFG, country.codeABW, country.codeAND, country.codeANT, country.codeASM, country.codeATG, country.codeBHR, country.codeBHS, country.codeBMU, country.codeBRB, country.codeBRN, country.codeCHI, country.codeCSK, country.codeCUB, country.codeCUW, country.codeCYM, country.codeDMA, country.codeERI, country.codeFRO, country.codeGIB, country.codeGNQ, country.codeGRD, country.codeGRL, country.codeGUM, country.codeHKG, country.codeIMN, country.codeKHM, country.codeKNA, country.codeKWT, country.codeLBY, country.codeLIE, country.codeMAC, country.codeMAF, country.codeMCO, country.codeMNP, country.codeMYT, country.codeNCL, country.codeNZL, country.codeOMN, country.codePLW, country.codePRI, country.codePRK, country.codePYF, country.codeQAT, country.codeSAU, country.codeSGP, country.codeSMR, country.codeSUN, country.codeSXM, country.codeTCA, country.codeTWN, country.codeVCT, country.codeVGB, country.codeVIR, country.codeYUG, country.codeYUGf, country.codeAFE, country.codeAFW, country.codeARB, country.codeCEB, country.codeCSS, country.codeEAP, country.codeEAR, country.codeECA, country.codeEMU, country.codeEUU, country.codeHPC, country.codeIBD, country.codeIBT, country.codeIDB, country.codeIDX, country.codeLAC, country.codeLDC, country.codeLTE, country.codeMIC, country.codeMNA, country.codeNAC, country.codeOED, country.codeOSS, country.codePRE, country.codePST, country.codeSSA, country.codeSST, country.codeTEA, country.codeTEC, country.codeTLA, country.codeTMN, country.codeTSA, country.codeTSS, country.codePSS, regEurope & Central Asia, regLatin America & Caribbean, regNorth America, edu.total, mil
## 5 country.codeBTN, country.codeCOM, country.codeDJI, country.codeEAS, country.codeECS, country.codeFCS, country.codeFSM, country.codeHIC, country.codeHTI, country.codeIDA, country.codeISL, country.codeKIR, country.codeLCA, country.codeLCN, country.codeLIC, country.codeLMC, country.codeLMY, country.codeMDV, country.codeMEA, country.codeMHL, country.codeNRU, country.codePSE, country.codeSAS, country.codeSLB, country.codeSOM, country.codeSSF, country.codeSTP, country.codeSUR, country.codeTON, country.codeTTO, country.codeTUV, country.codeUMC, country.codeVUT, country.codeWLD, country.codeWSM, country.codeAFG, country.codeABW, country.codeAND, country.codeANT, country.codeASM, country.codeATG, country.codeBHR, country.codeBHS, country.codeBMU, country.codeBRB, country.codeBRN, country.codeCHI, country.codeCSK, country.codeCUB, country.codeCUW, country.codeCYM, country.codeDMA, country.codeERI, country.codeFRO, country.codeGIB, country.codeGNQ, country.codeGRD, country.codeGRL, country.codeGUM, country.codeHKG, country.codeIMN, country.codeKHM, country.codeKNA, country.codeKWT, country.codeLBY, country.codeLIE, country.codeMAC, country.codeMAF, country.codeMCO, country.codeMNP, country.codeMYT, country.codeNCL, country.codeNZL, country.codeOMN, country.codePLW, country.codePRI, country.codePRK, country.codePYF, country.codeQAT, country.codeSAU, country.codeSGP, country.codeSMR, country.codeSUN, country.codeSXM, country.codeTCA, country.codeTWN, country.codeVCT, country.codeVGB, country.codeVIR, country.codeYUG, country.codeYUGf, country.codeAFE, country.codeAFW, country.codeARB, country.codeCEB, country.codeCSS, country.codeEAP, country.codeEAR, country.codeECA, country.codeEMU, country.codeEUU, country.codeHPC, country.codeIBD, country.codeIBT, country.codeIDB, country.codeIDX, country.codeLAC, country.codeLDC, country.codeLTE, country.codeMIC, country.codeMNA, country.codeNAC, country.codeOED, country.codeOSS, country.codePRE, country.codePST, country.codeSSA, country.codeSST, country.codeTEA, country.codeTEC, country.codeTLA, country.codeTMN, country.codeTSA, country.codeTSS, country.codePSS, regEurope & Central Asia, regLatin America & Caribbean, regMiddle East & North Africa, regSub-Saharan Africa, edu.total, hlth
## 6 country.codeEAS, country.codeECS, country.codeFCS, country.codeHIC, country.codeIDA, country.codeLCN, country.codeLIC, country.codeLMC, country.codeLMY, country.codeMEA, country.codeNRU, country.codeSAS, country.codeSSF, country.codeUMC, country.codeUSA, country.codeWLD, country.codeAFG, country.codeABW, country.codeAND, country.codeANT, country.codeASM, country.codeATG, country.codeBHR, country.codeBHS, country.codeBMU, country.codeBRB, country.codeBRN, country.codeCHI, country.codeCSK, country.codeCUB, country.codeCUW, country.codeCYM, country.codeDMA, country.codeERI, country.codeFRO, country.codeGIB, country.codeGNQ, country.codeGRD, country.codeGRL, country.codeGUM, country.codeHKG, country.codeIMN, country.codeKHM, country.codeKNA, country.codeKWT, country.codeLBY, country.codeLIE, country.codeMAC, country.codeMAF, country.codeMCO, country.codeMNP, country.codeMYT, country.codeNCL, country.codeNZL, country.codeOMN, country.codePLW, country.codePRI, country.codePRK, country.codePYF, country.codeQAT, country.codeSAU, country.codeSGP, country.codeSMR, country.codeSUN, country.codeSXM, country.codeTCA, country.codeTWN, country.codeVCT, country.codeVGB, country.codeVIR, country.codeYUG, country.codeYUGf, country.codeAFE, country.codeAFW, country.codeARB, country.codeCEB, country.codeCSS, country.codeEAP, country.codeEAR, country.codeECA, country.codeEMU, country.codeEUU, country.codeHPC, country.codeIBD, country.codeIBT, country.codeIDB, country.codeIDX, country.codeLAC, country.codeLDC, country.codeLTE, country.codeMIC, country.codeMNA, country.codeNAC, country.codeOED, country.codeOSS, country.codePRE, country.codePST, country.codeSSA, country.codeSST, country.codeTEA, country.codeTEC, country.codeTLA, country.codeTMN, country.codeTSA, country.codeTSS, country.codePSS, regEurope & Central Asia, regMiddle East & North Africa, regSouth Asia, edu.total, mil
We can take a look into one of the imputated data sets.
countries1.imputed.1 <- complete(countries1.imputed,
1)
summary(countries1.imputed.1)
## country.code country.name year pov income
## BRA : 36 Length:1843 Min. :1967 Min. : 0.00 H :623
## CRI : 34 Class :character 1st Qu.:2001 1st Qu.: 0.20 L :261
## ARG : 32 Mode :character Median :2008 Median : 1.50 LM:509
## USA : 32 Mean :2007 Mean :10.04 UM:450
## HND : 30 3rd Qu.:2014 3rd Qu.:11.60
## GBR : 29 Max. :2021 Max. :91.50
## (Other):1650
## reg edu.total hlth
## East Asia & Pacific :167 Min. : 1.033 Min. : 1.718
## Europe & Central Asia :845 1st Qu.: 3.437 1st Qu.: 4.980
## Latin America & Caribbean :416 Median : 4.388 Median : 6.771
## Middle East & North Africa:104 Mean : 4.482 Mean : 6.813
## North America : 50 3rd Qu.: 5.376 3rd Qu.: 8.463
## South Asia : 53 Max. :15.750 Max. :17.733
## Sub-Saharan Africa :208
## mil fdi lbr.part unemp
## Min. : 0.000 Min. :-3.444e+11 Min. :30.50 Min. : 0.250
## 1st Qu.: 1.040 1st Qu.: 2.982e+08 1st Qu.:56.31 1st Qu.: 4.415
## Median : 1.465 Median : 1.702e+09 Median :61.46 Median : 6.770
## Mean : 1.784 Mean : 1.613e+10 Mean :61.14 Mean : 8.083
## 3rd Qu.: 2.100 3rd Qu.: 9.793e+09 3rd Qu.:65.93 3rd Qu.:10.075
## Max. :19.385 Max. : 7.338e+11 Max. :93.00 Max. :49.700
##
## pop.gwth.total pop.gwth.rural pop.gwth.urban gdp.dflt
## Min. :-3.6295 Min. :-8.560655 Min. :-4.078 Min. : -26.300
## 1st Qu.: 0.2836 1st Qu.:-0.846827 1st Qu.: 0.517 1st Qu.: 1.734
## Median : 1.0378 Median :-0.021085 Median : 1.485 Median : 3.890
## Mean : 1.0639 Mean : 0.005301 Mean : 1.692 Mean : 16.129
## 3rd Qu.: 1.7764 3rd Qu.: 0.959723 3rd Qu.: 2.652 3rd Qu.: 8.606
## Max. : 5.6145 Max. : 4.596858 Max. :13.805 Max. :3333.585
##
## gcf trade gdp.pc lfdi
## Min. : 0.00 Min. : 1.378 Min. : 119.7 Min. : 6.908
## 1st Qu.:19.63 1st Qu.: 50.622 1st Qu.: 1904.4 1st Qu.:19.513
## Median :22.71 Median : 72.606 Median : 5913.4 Median :21.255
## Mean :23.89 Mean : 83.509 Mean : 14983.6 Mean :20.767
## 3rd Qu.:26.78 3rd Qu.:104.706 3rd Qu.: 20863.0 3rd Qu.:23.008
## Max. :69.48 Max. :380.104 Max. :123678.7 Max. :27.322
##
## lgdp.dflt lgdp.pc
## Min. :-2.9224 Min. : 4.785
## 1st Qu.: 0.5504 1st Qu.: 7.552
## Median : 1.3584 Median : 8.685
## Mean : 1.2262 Mean : 8.666
## 3rd Qu.: 2.1506 3rd Qu.: 9.946
## Max. : 8.1118 Max. :11.725
##
sum(!complete.cases(countries1.imputed.1))
## [1] 0
We found no missing cases as expected.
We generated 5 sets of imputated (training) data. We can build our model on each of these data sets and combine the estimates using pooling rule. With all the generated data sets.
formula.str <- "pov ~ reg+year+income+edu.total+hlth+mil+fdi+lbr.part+unemp+pop.gwth.total+pop.gwth.rural+pop.gwth.urban+gdp.dflt+gcf+trade+gdp.pc+lfdi+lgdp.dflt+lgdp.pc"
# Using only the first imputed data set
fit1 <- lm(as.formula(formula.str), data = complete(countries1.imputed,
1))
summary(fit1)
##
## Call:
## lm(formula = as.formula(formula.str), data = complete(countries1.imputed,
## 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.530 -3.070 -0.009 2.574 54.008
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.962e+02 4.793e+01 6.179 7.93e-10 ***
## regEurope & Central Asia -1.643e+00 8.692e-01 -1.891 0.058846 .
## regLatin America & Caribbean 5.615e-02 8.680e-01 0.065 0.948428
## regMiddle East & North Africa -9.028e-01 1.208e+00 -0.747 0.454985
## regNorth America -4.258e+00 1.584e+00 -2.689 0.007242 **
## regSouth Asia -6.935e-01 1.342e+00 -0.517 0.605483
## regSub-Saharan Africa 1.658e+01 1.021e+00 16.245 < 2e-16 ***
## year -1.056e-01 2.437e-02 -4.332 1.56e-05 ***
## incomeL -4.909e+00 1.866e+00 -2.631 0.008581 **
## incomeLM -1.090e+01 1.207e+00 -9.027 < 2e-16 ***
## incomeUM -6.180e+00 8.523e-01 -7.251 6.11e-13 ***
## edu.total -7.838e-01 1.383e-01 -5.668 1.68e-08 ***
## hlth 3.293e-01 1.120e-01 2.939 0.003333 **
## mil -7.389e-01 1.663e-01 -4.443 9.42e-06 ***
## fdi 6.119e-12 4.776e-12 1.281 0.200284
## lbr.part 2.008e-01 2.584e-02 7.772 1.29e-14 ***
## unemp 3.551e-02 3.940e-02 0.901 0.367623
## pop.gwth.total -2.332e+00 5.959e-01 -3.914 9.42e-05 ***
## pop.gwth.rural 8.943e-01 2.360e-01 3.790 0.000156 ***
## pop.gwth.urban 2.048e+00 3.438e-01 5.958 3.07e-09 ***
## gdp.dflt -7.882e-04 1.574e-03 -0.501 0.616586
## gcf -1.128e-01 2.944e-02 -3.831 0.000132 ***
## trade -1.437e-02 4.743e-03 -3.030 0.002478 **
## gdp.pc 2.323e-04 2.353e-05 9.872 < 2e-16 ***
## lfdi 1.315e-03 6.836e-02 0.019 0.984660
## lgdp.dflt 1.905e-01 1.481e-01 1.286 0.198547
## lgdp.pc -9.332e+00 5.635e-01 -16.561 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.035 on 1816 degrees of freedom
## Multiple R-squared: 0.793, Adjusted R-squared: 0.79
## F-statistic: 267.6 on 26 and 1816 DF, p-value: < 2.2e-16
fit1.pred <- predict(fit1, countries.test.4)
# Build models with all generated data set and
# pool the estimates
fit2 <- with(data = countries1.imputed, exp = lm(as.formula(formula.str)))
fit2.combined <- pool(fit2)
summary(fit2.combined)
# dummy lm model
fit2.dummy <- lm(as.formula(formula.str), data = countries1)
# replace coefficients of dummy model & predict
fit2.dummy$coefficients <- fit2.combined$pooled$estimate
fit2.pred <- predict(fit2.dummy, countries.test.4)
Helper function to calculate R-Squared
r2 <- function(pred, orig) {
RSS <- sum((pred - orig)^2)
TSS <- sum((pred - mean(orig))^2)
R2 <- 1 - RSS/TSS
return(R2)
}
adjR2 <- function(pred, orig, k) {
R2 <- r2(pred, orig)
n <- length(pred)
adjr2 <- 1 - (1 - R2) * (n - 1)/(n - k - 1)
return(adjr2)
}
Adjusted R-Squared on train and test
k <- length(fit1$coefficients) - 1
fit1.train.r2 <- summary(fit1)$adj.r.squared
fit1.test.r2 <- adjR2(fit1.pred, countries.test.4$pov,
k)
fit2.train.r2 <- pool.r.squared(fit2, adjusted = T)[,
"est"]
fit2.test.r2 <- adjR2(fit2.pred, countries.test.4$pov,
k)
res <- data.frame(model = c("Single", "Pooled"), train = c(fit1.train.r2,
fit2.train.r2), test = c(fit1.test.r2, fit2.test.r2))
kable(res)
| model | train | test |
|---|---|---|
| Single | 0.7900486 | 0.5732851 |
| Pooled | 0.7858952 | 0.5743992 |
The models seem to be over-fitting. We should perform model simplification.
Prepare data
sd0 <- function(vct) {
if (!is.numeric(vct)) {
return(NA)
}
return(sd(vct, na.rm = T))
}
std0 <- function(vct, scl) {
if (!is.numeric(vct)) {
return(vct)
}
return(vct/scl)
}
standardise <- function(train, test) {
data <- rbind(train, test)
scaler <- unlist(lapply(data, sd0))
numCols <- which(unlist(lapply(data, is.numeric)))
num <- as.data.frame(mapply(std0, vct = data[,
numCols], scl = scaler[numCols], SIMPLIFY = T))
fct <- data[, -numCols]
return(cbind(fct, num))
}
optimalModel <- function(train) {
# standardized
}
We can conduct a step-wise AIC variable selection. It is similar to the procedure we use in class, but based on a metric call AIC (Akaike Information Criterion), which is an estimator of prediction error and relative quality of statistical models. The lower AIC is, the better the model fits. \ We have no idea how to pool coefficients for AIC model,
# helper
fitAIC <- function(i) {
fit <- lm(as.formula(formula.str), complete(countries1.imputed,
i))
aic.fit <- stepAIC(fit, trace = F, direction = "backward")
return(aic.fit)
}
# build models
aic.fits <- lapply(1:countries1.imputed$m, fitAIC)
# predict with each model
aic.pred <- lapply(aic.fits, predict, newdata = countries.test.4)
# calculate adj r2
aic.train.r2 <- unlist(lapply(aic.fits, function(model) summary(model)$adj.r.squared))
k <- lapply(aic.fits, function(m) length(m$coefficients))
aic.test.r2 <- unlist(mapply(adjR2, aic.pred, k = k,
MoreArgs = list(orig = countries.test.4$pov)))
res <- data.frame(`set no.` = 1:3, train = aic.train.r2,
test = aic.test.r2)
Adjusted R-Squared
kable(res)
| set.no. | train | test |
|---|---|---|
| 1 | 0.7901813 | 0.5745707 |
| 2 | 0.7865726 | 0.5630834 |
| 3 | 0.7815808 | 0.5853173 |
-> AIC on one model link
-> stepwise selection on mice link
Check assumption outliers #### 3.2.4.3. Interpretation